Kapitel 6.9: Informationsvermittlung¶

Das Notebook ergänzt Kapitel 6.9 'Informationsvermittlung'.

Import¶

In [1]:
import pandas as pd
import numpy as np

from resources_statistics import *
from resources_geschichtslyrik import *

import plotly.express as px
import plotly.graph_objects as go
from plotly.validators.scatter.marker import SymbolValidator

from tqdm.notebook import tqdm
In [2]:
# Load the poem metadata (one record per text occurrence) from the shared
# resources directory.
meta = pd.read_json(r"../resources/meta.json")

Merkmale hinzufügen¶

In [3]:
# 1 if the speaker's stance towards knowledge ('wissen') is annotated at all,
# else 0 — vectorized replacement for the original list comprehension.
meta['wissen_behandelt'] = meta['wissen'].notna().astype(int)
# Occurrences per poem (author_title) within the anthology corpus; rows
# outside 'anth' get NaN because the transform only covers that subset.
meta['count'] = meta.query("corpus=='anth'").groupby('author_title')['author_title'].transform('count')

Korpora¶

Korpora erstellen¶

In [4]:
# Anthology sub-corpus: historical poems from 1850-1918, one row per poem
# (duplicate anthology occurrences collapsed via author_title).
meta_anth = (
    meta
    .query("corpus=='anth' and 1850 <= year <= 1918 and geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
)
# Binarized variant for the correlation / contingency analyses below.
meta_anth_bin = binarize_meta(meta_anth)
In [5]:
# Comparison corpus: canonized modernist authors, same year and genre filter
# as the anthology sub-corpus.
modcanon_authors = ['Hofmannsthal, Hugo von', 'Rilke, Rainer Maria', 'George, Stefan', 'Heym, Georg']

meta_modcanon = (
    meta
    .query("author in @modcanon_authors and 1850 <= year <= 1918 and geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
)
In [6]:
# Comparison corpus: authors of the Muenchhausen circle, same year and genre
# filter as the anthology sub-corpus.
muench_authors = ['Münchhausen, Börries von', 'Miegel, Agnes', 'Strauß und Torney, Lulu von']

meta_muench = (
    meta
    .query("author in @muench_authors and 1850 <= year <= 1918 and geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
)
In [7]:
# Summary frame: one row per sub-corpus, columns filled in the next cell.
sub_df = pd.DataFrame()
sub_names = ['Anthologien', 'Kanonisierte Moderne', 'Münchhausen-Kreis']
sub_metas = [meta_anth, meta_modcanon, meta_muench]

Merkmale berechnen¶

In [8]:
# Fill sub_df with the share of texts per feature for each sub-corpus.
# Refactored: the text count is computed once per corpus, the four marker
# presence columns and the five 0..4-marker distribution columns are filled
# by loops, and the marker-column concatenation is done once instead of twice.
for this_name, this_meta in zip(sub_names, sub_metas):
    n_texts = this_meta.shape[0]  # denominator for all share columns

    sub_df.loc[this_name, 'Jahr'] = round(this_meta['year'].mean(), 0)
    sub_df.loc[this_name, 'Texte'] = n_texts

    # Stance towards knowledge: NaN = neutral, 1 = positive, -1 = negative,
    # 0 = ambivalent (encoding as used by the queries below).
    sub_df.loc[this_name, 'wissen_neutral'] = this_meta.query("wissen.isna()").shape[0]/n_texts
    sub_df.loc[this_name, 'wissen_positiv'] = this_meta.query("wissen == 1").shape[0]/n_texts
    sub_df.loc[this_name, 'wissen_negativ'] = this_meta.query("wissen == -1").shape[0]/n_texts
    sub_df.loc[this_name, 'wissen_ambivalent'] = this_meta.query("wissen == 0").shape[0]/n_texts

    # Relation of the text to historical knowledge.
    sub_df.loc[this_name, 'uebereinstimmend'] = this_meta.query("verhaeltnis_wissen == 'übereinstimmend'").shape[0]/n_texts
    sub_df.loc[this_name, 'ergaenzend'] = this_meta.query("verhaeltnis_wissen == 'ergänzend'").shape[0]/n_texts
    sub_df.loc[this_name, 'abweichend_natuerlich'] = this_meta.query("verhaeltnis_wissen.str.contains('abweichend_natürlich')").shape[0]/n_texts
    sub_df.loc[this_name, 'abweichend_uebernatuerlich'] = this_meta.query("verhaeltnis_wissen.str.contains('abweichend_übernatürlich')").shape[0]/n_texts

    # Presence of each marker type ('/' encodes "no marker").
    marker_cols = ['marker_person', 'marker_zeit', 'marker_ort', 'marker_objekt']
    for marker_col in marker_cols:
        sub_df.loc[this_name, marker_col] = this_meta.query(f"{marker_col} != '/'").shape[0]/n_texts

    # Markers per text, and how often markers appear in the title vs. the
    # text body (the marker columns are strings that may contain 'Titel'
    # and/or 'Text'; concatenate once instead of twice).
    marker_all_counts = (this_meta[marker_cols] != '/').sum(axis=1)
    concatenated_markers = this_meta[marker_cols].sum(axis=1)
    marker_title_counts = [x.count('Titel') for x in concatenated_markers]
    marker_text_counts = [x.count('Text') for x in concatenated_markers]

    sub_df.loc[this_name, 'marker_all_per_text'] = marker_all_counts.sum()/n_texts
    sub_df.loc[this_name, 'marker_text_per_text'] = np.sum(marker_text_counts)/n_texts
    sub_df.loc[this_name, 'marker_title_per_text'] = np.sum(marker_title_counts)/n_texts

    # Distribution over the number of distinct marker types (0..4) per text.
    for k in range(5):
        sub_df.loc[this_name, f'{k}marker'] = (marker_all_counts == k).sum()/n_texts
In [9]:
# Overview table of all sub-corpus features (rounded for display).
round(sub_df, 4)
Out[9]:
Jahr Texte wissen_neutral wissen_positiv wissen_negativ wissen_ambivalent uebereinstimmend ergaenzend abweichend_natuerlich abweichend_uebernatuerlich ... marker_ort marker_objekt marker_all_per_text marker_text_per_text marker_title_per_text 0marker 1marker 2marker 3marker 4marker
Anthologien 1875.0 1850.0 0.9119 0.0405 0.0330 0.0146 0.1427 0.7476 0.0032 0.1070 ... 0.2768 0.5978 2.1227 2.0151 0.7038 0.0265 0.2286 0.4114 0.2627 0.0708
Kanonisierte Moderne 1903.0 113.0 0.8938 0.0177 0.0531 0.0354 0.1593 0.7257 0.0000 0.1150 ... 0.3097 0.4336 1.5929 1.1062 0.7080 0.0619 0.3982 0.4425 0.0796 0.0177
Münchhausen-Kreis 1905.0 140.0 0.8571 0.0286 0.0714 0.0429 0.1214 0.7714 0.0000 0.1071 ... 0.1357 0.6429 1.9071 1.8643 0.5643 0.0429 0.2929 0.4214 0.2000 0.0429

3 rows × 22 columns

Zeitverlauf¶

In [10]:
# Time-series frame indexed by publication year (1850-1918 inclusive).
ts = pd.DataFrame(index=pd.Series(range(1850, 1919), name='year'))
In [11]:
# Texts per year; years without any text become 0 so shares are well-defined.
ts['text_count'] = meta_anth.groupby('year').size()
ts['text_count'] = ts['text_count'].fillna(0)
# Windowed sum; `smooth` is imported via the resources_* star imports.
ts['text_sum'] = smooth(ts['text_count'], mode = 'sum')
In [12]:
# Yearly feature counts, smoothed (windowed) sums, and smoothed shares.
# Each simple feature is defined by a name and a pandas query condition that
# is evaluated per year — this replaces twelve copy-pasted 3-line stanzas.
feature_conditions = [
    ('wissen_neutral', "wissen.isna()"),
    ('wissen_positiv', "wissen == 1"),
    ('wissen_negativ', "wissen == -1"),
    ('wissen_ambivalent', "wissen == 0"),
    ('uebereinstimmend', "verhaeltnis_wissen == 'übereinstimmend'"),
    ('ergaenzend', "verhaeltnis_wissen == 'ergänzend'"),
    ('abweichend_natuerlich', "verhaeltnis_wissen.str.contains('abweichend_natürlich')"),
    ('abweichend_uebernatuerlich', "verhaeltnis_wissen.str.contains('abweichend_übernatürlich')"),
    ('marker_person', "marker_person != '/'"),
    ('marker_zeit', "marker_zeit != '/'"),
    ('marker_ort', "marker_ort != '/'"),
    ('marker_objekt', "marker_objekt != '/'"),
]

for feature_name, condition in feature_conditions:
    ts[f'{feature_name}_count'] = [
        meta_anth.query(f"year == @x and {condition}").shape[0] for x in ts.index
    ]
    ts[f'{feature_name}_sum'] = smooth(ts[f'{feature_name}_count'], mode = 'sum')
    ts[f'{feature_name}_share_smoothed'] = ts[f'{feature_name}_sum']/ts['text_sum']

# Marker statistics need the marker columns themselves ('/' encodes
# "no marker"; the strings may contain 'Titel' and/or 'Text').
marker_cols = ['marker_person', 'marker_zeit', 'marker_ort', 'marker_objekt']
for year in ts.index:
    meta_year = meta_anth.query("year == @year")
    marker_all_counts = (meta_year[marker_cols] != '/').sum(axis=1)
    concatenated_markers = meta_year[marker_cols].sum(axis=1)
    ts.loc[year, 'marker_all_count'] = marker_all_counts.sum()
    ts.loc[year, 'marker_title_count'] = np.sum([x.count('Titel') for x in concatenated_markers])
    ts.loc[year, 'marker_text_count'] = np.sum([x.count('Text') for x in concatenated_markers])
    # Distribution over the number of distinct marker types (0..4) per text.
    for k in range(5):
        ts.loc[year, f'{k}marker_count'] = (marker_all_counts == k).sum()

# Smooth the marker counts and normalize by the smoothed text counts.
for marker_name in ['marker_all', 'marker_title', 'marker_text']:
    ts[f'{marker_name}_sum'] = smooth(ts[f'{marker_name}_count'], mode = 'sum')
    ts[f'{marker_name}_per_text_smoothed'] = ts[f'{marker_name}_sum']/ts['text_sum']
for k in range(5):
    ts[f'{k}marker_sum'] = smooth(ts[f'{k}marker_count'], mode = 'sum')
    ts[f'{k}marker_share_smoothed'] = ts[f'{k}marker_sum']/ts['text_sum']

Wissen der Sprechinstanz¶

In [13]:
# Share of texts per knowledge-stance category, per sub-corpus (transposed
# for display).
sub_df[[
    'wissen_neutral',
    'wissen_positiv',
    'wissen_negativ',
    'wissen_ambivalent'
]].T
Out[13]:
Anthologien Kanonisierte Moderne Münchhausen-Kreis
wissen_neutral 0.911892 0.893805 0.857143
wissen_positiv 0.040541 0.017699 0.028571
wissen_negativ 0.032973 0.053097 0.071429
wissen_ambivalent 0.014595 0.035398 0.042857
In [14]:
# Time series of the speaker's stance towards knowledge, with the three
# sub-corpora added as reference points.
meta_plot = ts[[
    'wissen_neutral_share_smoothed',
    'wissen_positiv_share_smoothed',
    'wissen_negativ_share_smoothed',
    'wissen_ambivalent_share_smoothed',
]].rename(columns={
    'wissen_neutral_share_smoothed': 'neutral',
    'wissen_positiv_share_smoothed': 'wissend',
    'wissen_negativ_share_smoothed': 'unwissend',
    'wissen_ambivalent_share_smoothed': 'ambivalent',
})

fig = create_ts_plot(
    data = meta_plot,
    columns = meta_plot.columns,
    y_axis_title = 'Anteil an Texten',
    add_corporas = sub_df, add_corpora_names = sub_names,
    add_corpora_categories = ['wissen_neutral', 'wissen_positiv', 'wissen_negativ', 'wissen_ambivalent']
)
fig.show()

wissen_behandelt¶

In [15]:
# Analysis target for the following cells.
main_feature = 'wissen_behandelt'
In [16]:
# Strongest positive correlations with the main feature in the binarized
# anthology corpus.
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = False).head(15)
Out[16]:
wissen_behandelt                        1.000000
wissend                                 0.661295
unwissend                               0.594051
nation_volk_d_negativ                   0.218218
sprechinstanz_markiert                  0.198099
sprechinstanz_nicht_in_vergangenheit    0.169545
sprechakt_fragen_vorhanden              0.138801
geschichtsauffassung_negativ            0.113254
gegenwartsdominant                      0.111109
sprechakte_count                        0.091551
sprechakt_beschreiben_vorhanden         0.091547
geschichtsauffassung                    0.089392
gegenwartsbezug                         0.087330
zustand                                 0.082561
sprechakt_behaupten_vorhanden           0.076391
Name: wissen_behandelt, dtype: float64
In [17]:
# Strongest negative correlations with the main feature.
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = True).head(15)
Out[17]:
geschichtsauffassung_positiv     -0.365676
nation_volk_d_positiv            -0.259097
sprechakt_erzaehlen_vorhanden    -0.144987
ereignis                         -0.115277
liebe_positiv                    -0.113312
ballade                          -0.112486
konkretheit                      -0.093721
in_hohem_mass_konkret            -0.090452
unbekanntes_individuum_negativ   -0.089703
tod_negativ                      -0.084034
mittelalter                      -0.068044
entity_count                     -0.062010
bekanntes_individuum_positiv     -0.058035
unbekanntes_individuum_count     -0.055522
religion_negativ                 -0.055331
Name: wissen_behandelt, dtype: float64
In [18]:
threshold = 0.15  # minimum |correlation| for a feature to enter the comparison

# Compute the correlation series once instead of twice — corr() over the
# full binarized frame is the expensive part of this cell.
corr_with_main = meta_anth_bin.corr(numeric_only=True)[main_feature]
bin_comp_features = get_features(corr_with_main, threshold = threshold, mode = 'bin')
cont_comp_features = get_features(corr_with_main, threshold = threshold, mode = 'cont')
In [19]:
# Contingency analysis of the main feature against each binary comparison
# feature; `relations_binbin` is imported via the resources_* star imports.
results = relations_binbin(
    meta = meta_anth_bin, 
    main_feature = main_feature,
    comp_features = bin_comp_features
)
In [20]:
# Features that encode (parts of) the main feature itself and would
# trivially associate with it.
directly_related = [
    'wissend', 'unwissend'
]
# Keep only statistically reliable, sufficiently strong associations.
significance_filter = "chi2_p < 0.05 and min_expected >= 5 and phi >= @threshold"
results_filtered = (
    results[~results.index.isin(directly_related)]
    .query(significance_filter)
    .sort_values(by = 'diff', ascending = False)
)
round(results_filtered, 2)
Out[20]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
sprechinstanz_markiert 0.41 688/1687 0.75 123/163 0.28 0.28 0.35 0.42 0.42 72.60 0.0 0.0 0.20 40.0 71.46
sprechinstanz_nicht_in_vergangenheit 0.25 428/1687 0.52 85/163 0.19 0.19 0.27 0.35 0.35 53.18 0.0 0.0 0.17 78.0 45.20
geschichtsauffassung_positiv 0.43 26/61 0.00 0/16 -0.54 -0.55 -0.43 -0.30 -0.31 10.30 0.0 0.0 0.37 0.0 5.40
In [21]:
# All remaining associations that did not pass the filter above.
results_other = results.query("index not in @results_filtered.index")
round(results_other.sort_values(by='diff', ascending=False), 2)
Out[21]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
wissend 0.00 0/1687 0.46 75/163 0.39 0.38 0.46 0.54 0.53 809.03 0.00 0.00 0.66 0.0 6.61
unwissend 0.00 0/1687 0.37 61/163 0.29 0.30 0.37 0.45 0.45 652.86 0.00 0.00 0.59 0.0 5.37
nation_volk_d_negativ 0.08 4/49 0.29 2/7 -0.10 -0.14 0.20 0.55 0.61 2.67 0.10 0.16 0.22 2.0 0.75
entity_ambivalent 0.06 216/3599 0.09 28/317 -0.00 -0.00 0.03 0.06 0.06 4.00 0.05 0.05 0.03 28.0 19.75
kollektiv_positiv 0.37 361/974 0.39 35/90 -0.08 -0.09 0.02 0.12 0.12 0.12 0.73 0.73 0.01 35.0 33.50
stoffgebiet_neutral 0.19 436/2312 0.20 46/228 -0.04 -0.04 0.01 0.07 0.07 0.23 0.63 0.66 0.01 46.0 43.27
stoffgebiet_ambivalent 0.13 306/2312 0.14 32/228 -0.04 -0.04 0.01 0.06 0.06 0.12 0.73 0.76 0.01 32.0 30.34
entity_neutral 0.30 1062/3599 0.30 95/317 -0.05 -0.05 0.00 0.06 0.06 0.03 0.86 0.85 0.00 95.0 93.66
bekanntes_individuum_negativ 0.14 252/1850 0.14 24/175 -0.05 -0.05 0.00 0.05 0.05 0.00 0.97 1.00 0.00 24.0 23.85
entity_negativ 0.16 593/3599 0.16 51/317 -0.04 -0.05 -0.00 0.04 0.04 0.03 0.86 0.94 0.00 51.0 52.13
stoffgebiet_positiv 0.47 1079/2312 0.46 105/228 -0.07 -0.07 -0.01 0.06 0.06 0.03 0.86 0.89 0.00 105.0 106.28
kollektiv_negativ 0.25 246/974 0.24 22/90 -0.09 -0.10 -0.01 0.08 0.09 0.03 0.87 1.00 0.01 22.0 22.67
stoffgebiet_negativ 0.21 491/2312 0.20 45/228 -0.07 -0.07 -0.02 0.04 0.05 0.28 0.60 0.67 0.01 45.0 48.11
entity_positiv 0.48 1728/3599 0.45 143/317 -0.08 -0.09 -0.03 0.03 0.03 0.98 0.32 0.35 0.02 143.0 151.46
unbekanntes_individuum_positiv 0.35 206/596 0.31 11/35 -0.19 -0.19 -0.03 0.13 0.13 0.14 0.70 0.86 0.02 11.0 12.04
bekanntes_individuum_positiv 0.58 1065/1850 0.50 87/175 -0.15 -0.16 -0.08 -0.00 -0.00 4.02 0.04 0.05 0.04 87.0 75.44
unbekanntes_individuum_negativ 0.13 80/596 0.03 1/35 -0.15 -0.17 -0.11 -0.04 -0.03 3.30 0.07 0.07 0.07 1.0 4.49
nation_volk_d_positiv 0.78 38/49 0.43 3/7 -0.71 -0.73 -0.35 0.04 0.04 3.76 0.05 0.07 0.26 3.0 1.88
In [22]:
# Compare the filtered associations across an early (1850-1884) and a late
# (1885-1918) period.
result_categories = ['wenn_nicht', 'wenn_nicht_detail', 'wenn_ja', 'wenn_ja_detail', 'diff', 'chi2_p', 'phi',]

period_results = [
    relations_binbin(
        meta = meta_anth_bin.query(period_query),
        main_feature = main_feature,
        comp_features = results_filtered.index
    )
    for period_query in ("1850 <= year <= 1884", "1885 <= year <= 1918")
]
results_a, results_b = period_results

# Join the two periods side by side; suffixes mark the period start year.
results_merged = results_a[result_categories].join(
    results_b[result_categories],
    lsuffix='_1850', rsuffix = '_1885'
)
# Positive values: the difference / association is larger in the later period.
results_merged['diff_of_diffs'] = results_merged['diff_1885'] - results_merged['diff_1850']
results_merged['diff_of_phis'] = results_merged['phi_1885'] - results_merged['phi_1850']

round(results_merged.sort_values(by = 'diff_of_phis'), 3)
Out[22]:
wenn_nicht_1850 wenn_nicht_detail_1850 wenn_ja_1850 wenn_ja_detail_1850 diff_1850 chi2_p_1850 phi_1850 wenn_nicht_1885 wenn_nicht_detail_1885 wenn_ja_1885 wenn_ja_detail_1885 diff_1885 chi2_p_1885 phi_1885 diff_of_diffs diff_of_phis
geschichtsauffassung_positiv 0.395 17/43 0.000 0/14 -0.395 0.005 0.372 0.500 9/18 0.000 0/2 -0.500 0.178 0.302 -0.105 -0.070
sprechinstanz_nicht_in_vergangenheit 0.261 319/1222 0.540 67/124 0.279 0.000 0.179 0.234 109/465 0.462 18/39 0.227 0.002 0.140 -0.052 -0.039
sprechinstanz_markiert 0.408 499/1222 0.734 91/124 0.326 0.000 0.190 0.406 189/465 0.821 32/39 0.414 0.000 0.223 0.089 0.033
In [23]:
# Relate the main feature to the continuous comparison features.
results = relations_bincont(
    meta = meta_anth_bin, 
    main_feature = main_feature,
    comp_features = cont_comp_features
)
In [24]:
# Display the binary-continuous relation results.
# NOTE(review): the saved output of this cell is empty — possibly no
# continuous feature passed the 0.15 threshold; confirm.
results
Out[24]:

wissend¶

In [25]:
# Switch the analysis target to 'wissend'.
main_feature = 'wissend'
In [26]:
# Restrict to texts whose speaker is explicitly knowing or unknowing.
knowing_or_unknowing = (meta_anth_bin['wissend'] == 1) | (meta_anth_bin['unwissend'] == 1)
meta_rel = meta_anth_bin[knowing_or_unknowing].copy()
In [27]:
# Strongest positive correlations with 'wissend' within the restricted corpus.
meta_rel.corr(numeric_only=True)[main_feature].sort_values(ascending = False).head(20)
Out[27]:
wissend                           1.000000
wissen                            1.000000
nation_volk_d_positiv             0.707107
liebe_positiv                     0.632456
ueberlieferung_positiv            0.523785
religion_positiv                  0.514286
kollektiv_positiv                 0.344265
sprechakt_behaupten_vorhanden     0.334729
geschichtsauffassung_negativ      0.282889
entity_positiv                    0.243058
nogenre                           0.238077
stoffgebiet_positiv               0.222711
reim                              0.209635
wissen_identisch                  0.194534
sprechakt_auffordern_vorhanden    0.192846
gegenwartsbezug                   0.183776
krieg_positiv                     0.181463
mittelraum_count                  0.171691
ende                              0.171373
sprechakte_count                  0.171168
Name: wissend, dtype: float64
In [28]:
# Strongest negative correlations with 'wissend'.
meta_rel.corr(numeric_only=True)[main_feature].sort_values(ascending = True).head(20)
Out[28]:
unwissend                         -1.000000
tod_negativ                       -0.316228
krieg_negativ                     -0.302614
religion_negativ                  -0.298807
konkretheit                       -0.261153
entity_neutral                    -0.255536
in_hohem_mass_konkret             -0.238077
sprechinstanz_in_vergangenheit    -0.214824
ueberlieferung_negativ            -0.213201
politik_negativ                   -0.200446
liebe_negativ                     -0.200000
objektmarker_vorhanden            -0.183776
denkmal                           -0.176233
liebe                             -0.166221
rollengedicht                     -0.163107
stoffgebiet_negativ               -0.161601
wissen_ergaenzend                 -0.160328
sprechakt_beschreiben_vorhanden   -0.144416
tod                               -0.136444
anachronismus                     -0.116760
Name: wissend, dtype: float64
In [29]:
threshold = 0.2  # minimum |correlation| for a feature to enter the comparison

# Compute the correlation series once instead of twice — corr() over the
# restricted frame is the expensive part of this cell.
corr_with_main = meta_rel.corr(numeric_only=True)[main_feature]
bin_comp_features = get_features(corr_with_main, threshold = threshold, mode = 'bin')
# Manually added candidates below the threshold.
bin_comp_features = bin_comp_features + ['rollengedicht']
cont_comp_features = get_features(corr_with_main, threshold = threshold, mode = 'cont')
cont_comp_features = cont_comp_features + ['stoffgebiet_neutral']
In [30]:
# Contingency analysis of 'wissend' against each binary comparison feature.
results = relations_binbin(
    meta = meta_rel, 
    main_feature = main_feature,
    comp_features = bin_comp_features
)
In [31]:
# Feature excluded from the filtered view (shown separately below).
directly_related = ['krieg_negativ']
# Keep only statistically reliable, sufficiently strong associations.
significance_filter = "chi2_p < 0.05 and min_expected >= 5 and phi >= @threshold"
results_filtered = (
    results[~results.index.isin(directly_related)]
    .query(significance_filter)
    .sort_values(by = 'diff', ascending = False)
)
round(results_filtered, 2)
Out[31]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
kollektiv_positiv 0.16 4/25 0.49 24/49 0.11 0.13 0.33 0.53 0.51 7.65 0.01 0.01 0.32 4.0 9.46
sprechakt_behaupten_vorhanden 0.13 8/61 0.44 33/75 0.17 0.17 0.31 0.45 0.45 15.24 0.00 0.00 0.33 8.0 18.39
stoffgebiet_positiv 0.33 28/84 0.55 56/101 0.07 0.08 0.22 0.36 0.36 9.05 0.00 0.00 0.22 28.0 38.14
nogenre 0.18 11/61 0.40 30/75 0.08 0.07 0.22 0.37 0.35 7.71 0.01 0.01 0.24 11.0 18.39
reim 0.84 51/61 0.96 72/75 0.01 0.02 0.12 0.23 0.24 5.98 0.01 0.02 0.21 3.0 5.83
sprechinstanz_in_vergangenheit 0.33 20/61 0.15 11/75 -0.31 -0.32 -0.18 -0.04 -0.03 6.28 0.01 0.01 0.21 11.0 13.90
entity_neutral 0.43 51/119 0.22 32/144 -0.32 -0.32 -0.21 -0.09 -0.09 12.84 0.00 0.00 0.22 32.0 37.56
in_hohem_mass_konkret 0.82 50/61 0.60 45/75 -0.37 -0.37 -0.22 -0.07 -0.07 7.71 0.01 0.01 0.24 11.0 18.39
In [32]:
# All remaining associations that did not pass the filter above.
results_other = results.query("index not in @results_filtered.index")
round(results_other.sort_values(by='diff', ascending=False), 2)
Out[32]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
liebe_positiv 0.20 1/5 1.00 1/1 0.40 0.45 0.80 1.15 1.00 2.40 0.12 0.33 0.63 0.0 0.33
nation_volk_d_positiv 0.00 0/3 0.67 2/3 0.00 0.13 0.67 1.20 1.00 3.00 0.08 0.40 0.71 0.0 1.00
ueberlieferung_positiv 0.25 3/12 0.79 19/24 0.21 0.25 0.54 0.84 0.79 9.88 0.00 0.00 0.52 3.0 4.67
religion_positiv 0.29 2/7 0.80 8/10 0.07 0.10 0.51 0.93 0.90 4.50 0.03 0.06 0.51 2.0 2.88
geschichtsauffassung_negativ 0.17 1/6 0.43 3/7 -0.21 -0.21 0.26 0.73 0.71 1.04 0.31 0.56 0.28 1.0 1.85
entity_positiv 0.34 40/119 0.53 77/144 0.08 0.08 0.20 0.32 0.31 10.40 0.00 0.00 0.20 40.0 52.94
bekanntes_individuum_positiv 0.41 28/69 0.58 43/74 0.02 0.01 0.18 0.34 0.35 4.39 0.04 0.04 0.18 28.0 34.26
unbekanntes_individuum_positiv 0.26 5/19 0.38 5/13 -0.19 -0.21 0.12 0.45 0.46 0.53 0.47 0.70 0.13 5.0 4.06
kollektiv_negativ 0.20 5/25 0.27 13/49 -0.14 -0.13 0.07 0.26 0.27 0.38 0.54 0.58 0.07 5.0 6.08
bekanntes_individuum_negativ 0.10 7/69 0.16 12/74 -0.05 -0.05 0.06 0.17 0.17 1.14 0.29 0.33 0.09 7.0 9.17
entity_negativ 0.13 15/119 0.18 26/144 -0.03 -0.03 0.05 0.14 0.14 1.47 0.23 0.24 0.07 15.0 18.55
stoffgebiet_ambivalent 0.15 13/84 0.13 13/101 -0.13 -0.13 -0.03 0.08 0.08 0.26 0.61 0.67 0.04 13.0 11.81
entity_ambivalent 0.11 13/119 0.06 9/144 -0.12 -0.12 -0.05 0.02 0.02 1.86 0.17 0.19 0.08 9.0 9.95
stoffgebiet_neutral 0.26 22/84 0.18 18/101 -0.20 -0.20 -0.08 0.04 0.03 1.90 0.17 0.21 0.10 18.0 18.16
stoffgebiet_negativ 0.25 21/84 0.14 14/101 -0.22 -0.23 -0.11 0.00 -0.01 3.71 0.05 0.06 0.14 14.0 15.89
ueberlieferung_negativ 0.17 2/12 0.04 1/24 -0.38 -0.35 -0.12 0.10 0.08 1.64 0.20 0.25 0.21 1.0 1.00
rollengedicht 0.26 16/61 0.13 10/75 -0.27 -0.26 -0.13 0.01 0.01 3.62 0.06 0.08 0.16 10.0 11.66
religion_negativ 0.14 1/7 0.00 0/10 -0.43 -0.40 -0.14 0.12 0.00 1.52 0.22 0.41 0.30 0.0 0.41
politik_negativ 0.29 2/7 0.12 1/8 -0.59 -0.57 -0.16 0.24 0.25 0.60 0.44 0.57 0.20 1.0 1.40
krieg_negativ 0.32 9/28 0.08 3/36 -0.44 -0.43 -0.24 -0.04 -0.03 5.86 0.02 0.02 0.30 3.0 5.25
tod_negativ 0.25 2/8 0.00 0/4 -0.50 -0.55 -0.25 0.05 0.00 1.20 0.27 0.52 0.32 0.0 0.67
In [33]:
# Compare the filtered associations across an early (1850-1884) and a late
# (1885-1918) period, within the restricted knowing/unknowing corpus.
result_categories = ['wenn_nicht', 'wenn_nicht_detail', 'wenn_ja', 'wenn_ja_detail', 'diff', 'chi2_p', 'phi',]

results_a = relations_binbin(
    meta = meta_rel.query("1850 <= year <= 1884"), 
    main_feature = main_feature,
    comp_features = results_filtered.index
)

results_b = relations_binbin(
    meta = meta_rel.query("1885 <= year <= 1918"), 
    main_feature = main_feature,
    comp_features = results_filtered.index
)

# Join the two periods side by side; suffixes mark the period start year.
results_merged = results_a[result_categories].join(
    results_b[result_categories], 
    lsuffix='_1850', rsuffix = '_1885'
)
# Positive values: the difference / association is larger in the later period.
results_merged['diff_of_diffs'] = results_merged['diff_1885'] - results_merged['diff_1850']
results_merged['diff_of_phis'] = results_merged['phi_1885'] - results_merged['phi_1850']

round(results_merged.sort_values(by = 'diff_of_phis'), 3)
Out[33]:
wenn_nicht_1850 wenn_nicht_detail_1850 wenn_ja_1850 wenn_ja_detail_1850 diff_1850 chi2_p_1850 phi_1850 wenn_nicht_1885 wenn_nicht_detail_1885 wenn_ja_1885 wenn_ja_detail_1885 diff_1885 chi2_p_1885 phi_1885 diff_of_diffs diff_of_phis
in_hohem_mass_konkret 0.864 38/44 0.603 35/58 -0.260 0.004 0.286 0.706 12/17 0.588 10/17 -0.118 0.473 0.123 0.143 -0.163
sprechakt_behaupten_vorhanden 0.091 4/44 0.431 25/58 0.340 0.000 0.373 0.235 4/17 0.471 8/17 0.235 0.151 0.246 -0.105 -0.127
nogenre 0.205 9/44 0.414 24/58 0.209 0.025 0.222 0.118 2/17 0.353 6/17 0.235 0.106 0.277 0.026 0.056
reim 0.864 38/44 0.966 56/58 0.102 0.058 0.188 0.765 13/17 0.941 16/17 0.176 0.146 0.249 0.075 0.061
stoffgebiet_positiv 0.381 24/63 0.587 44/75 0.206 0.016 0.205 0.190 4/21 0.462 12/26 0.271 0.051 0.284 0.065 0.079
entity_neutral 0.396 36/91 0.212 24/113 -0.183 0.004 0.200 0.536 15/28 0.258 8/31 -0.278 0.029 0.284 -0.094 0.084
sprechinstanz_in_vergangenheit 0.273 12/44 0.138 8/58 -0.135 0.089 0.168 0.471 8/17 0.176 3/17 -0.294 0.067 0.314 -0.159 0.146
kollektiv_positiv 0.158 3/19 0.425 17/40 0.267 0.043 0.264 0.167 1/6 0.778 7/9 0.611 0.020 0.600 0.344 0.336
In [34]:
# Follow-up: how 'sprechakt_behaupten_vorhanden' relates to concreteness and
# genre in the full (binarized) anthology corpus.
results = relations_binbin(
    meta = meta_anth_bin, 
    main_feature = 'sprechakt_behaupten_vorhanden',
    comp_features = ['in_hohem_mass_konkret', 'nogenre']
)

round(results, 2)
Out[34]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
in_hohem_mass_konkret 0.91 1369/1505 0.42 144/345 -0.55 -0.55 -0.49 -0.44 -0.44 456.48 0.0 0.0 0.50 136.0 62.85
nogenre 0.18 266/1505 0.45 156/345 0.22 0.22 0.28 0.33 0.33 120.92 0.0 0.0 0.26 156.0 78.70
In [35]:
# Binary-continuous relations for 'wissend'; the raw 'wissen' column is
# excluded because it correlates perfectly with the main feature (see the
# correlation listing above).
results = relations_bincont(
    meta = meta_rel, 
    main_feature = main_feature,
    comp_features = [x for x in cont_comp_features if x != 'wissen']
)
In [36]:
# Sorted by point-biserial correlation; the commented-out query would
# restrict the table to significant results.
round(results.sort_values(by = 'pointbiserialr_corr', ascending = False), 4) # .query("mannwhitneyu_p < 0.05")
Out[36]:
wenn_nicht a_merkmal=0 a_merkmal=1 a_merkmal=2 a_merkmal=3 a_merkmal>=4 wenn_ja b_merkmal=0 b_merkmal=1 b_merkmal=2 ... pointbiserialr_corr pointbiserialr_p ttest_p cohens_d mannwhitneyu_stat mannwhitneyu_p meandiffs_ci_lower meandiffs_ci_bootstrap_lower meandiffs_ci_upper meandiffs_ci_bootstrap_upper
stoffgebiet_neutral 0.3607 0.69 [42/61] 0.26 [16/61] 0.05 [3/61] 0.0 [0/61] 0.0 [0/61] 0.2400 0.81 [61/75] 0.13 [10/75] 0.05 [4/75] ... -0.1077 0.2121 0.2121 0.2171 2556.0 0.1166 -0.3110 -0.3052 0.0697 0.0702
konkretheit 0.9098 0.0 [0/61] 0.82 [50/61] 0.0 [0/61] 0.0 [0/61] 0.0 [0/61] 0.7667 0.07 [5/75] 0.6 [45/75] 0.0 [0/75] ... -0.2612 0.0021 0.0021 0.5562 2817.5 0.0038 -0.2336 -0.2314 -0.0528 -0.0534

2 rows × 22 columns

In [37]:
# Histograms of each continuous comparison feature, split by knowing vs.
# unknowing speaker, with group means shown in the legend labels.
meta_plot = meta_rel.copy()

for cont_comp_feature in cont_comp_features:
    # Group means for the legend labels.
    mean_main = meta_plot[meta_plot[main_feature] == 1][cont_comp_feature].mean()
    mean_notmain = meta_plot[meta_plot[main_feature] == 0][cont_comp_feature].mean()
    label_main = f"Wissend<br>(Mittelwert = {round(mean_main, 2)})"
    label_notmain = f"Unwissend<br>(Mittelwert = {round(mean_notmain, 2)})"
    meta_plot['plot_legend'] = [label_main if x == 1 else label_notmain for x in meta_plot[main_feature]]

    # Density-normalized histogram so the two groups are comparable despite
    # different group sizes.
    fig = px.histogram(
        meta_plot,
        x = cont_comp_feature,
        color = 'plot_legend',
        histnorm = 'probability density',
        barmode = 'group',
        labels = {'plot_legend' : '', 
                  'entity_neutral' : 'Anzahl neutral bewertete Entitäten',
                  'stoffgebiet_positiv' : 'Anzahl positiv bewertete Stoffgebiete',
                  'entity_positiv' : 'Anzahl positiv bewertete Entitäten',
                 }
    )

    # NOTE(review): 'titlefont' is a legacy plotly alias (newer versions use
    # 'title_font') — confirm it is supported by the installed plotly version.
    fig.update_layout(
        width = 700, height = 300,
        yaxis_title="Anteil",
        xaxis=dict(tickfont=dict(size=16), titlefont=dict(size=16)),
        yaxis=dict(tickfont=dict(size=16), titlefont=dict(size=16)),
        legend=dict(font = dict(size=16), x=0.6, y = 0.92),
        bargap=0.1
    )
    # fig.write_image(f"plots/6.9 Wissen – {cont_comp_feature}.pdf")
    fig.show()
In [38]:
# Period comparison (1850-1884 vs. 1885-1918) of the continuous relations.
# NOTE(review): this cell uses meta_anth_bin while the non-period analysis
# above (In[35]) uses meta_rel — confirm the different base corpus is
# intentional.
result_categories = ['wenn_nicht', 'wenn_ja', 'mannwhitneyu_p', 'pointbiserialr_corr',]

results_a = relations_bincont(
    meta = meta_anth_bin.query("1850 <= year <= 1884"), 
    main_feature = main_feature,
    comp_features = [x for x in cont_comp_features if x != 'wissen']
)

results_b = relations_bincont(
    meta = meta_anth_bin.query("1885 <= year <= 1918"), 
    main_feature = main_feature,
    comp_features = [x for x in cont_comp_features if x != 'wissen']
)

# Join the two periods side by side; suffixes mark the period start year.
results_merged = results_a[result_categories].join(
    results_b[result_categories], 
    lsuffix='_1850', rsuffix = '_1885'
)
# Positive values: the correlation is stronger in the later period.
results_merged['diff_of_corrs'] = results_merged['pointbiserialr_corr_1885'] - results_merged['pointbiserialr_corr_1850']

round(results_merged.sort_values(by = 'diff_of_corrs'), 3)
Out[38]:
wenn_nicht_1850 wenn_ja_1850 mannwhitneyu_p_1850 pointbiserialr_corr_1850 wenn_nicht_1885 wenn_ja_1885 mannwhitneyu_p_1885 pointbiserialr_corr_1885 diff_of_corrs
konkretheit 0.911 0.776 0.000 -0.128 0.900 0.735 0.015 -0.137 -0.009
stoffgebiet_neutral 0.241 0.190 0.405 -0.020 0.316 0.412 0.670 0.030 0.050

Verhältnis zum historischen Wissen¶

In [39]:
# Time series of the text's relation to historical knowledge, with the three
# sub-corpora added as reference points; the figure is exported for print.
meta_plot = ts[[
    'uebereinstimmend_share_smoothed',
    'ergaenzend_share_smoothed',
    'abweichend_natuerlich_share_smoothed',
    'abweichend_uebernatuerlich_share_smoothed',
]].rename(columns={
    'uebereinstimmend_share_smoothed': 'übereinstimmend',
    'ergaenzend_share_smoothed': 'ergänzend',
    'abweichend_natuerlich_share_smoothed': 'abweichend_natürlich',
    'abweichend_uebernatuerlich_share_smoothed': 'abweichend_übernatürlich',
})
# save_ts_data(meta_plot, prefix='06_09_Historisches_Wissen_')

fig = create_ts_plot(
    data = meta_plot,
    columns = meta_plot.columns,
    y_axis_title = 'Anteil an Texten',
    add_corporas = sub_df, add_corpora_names = sub_names,
    add_corpora_categories = ['uebereinstimmend', 'ergaenzend', 'abweichend_natuerlich', 'abweichend_uebernatuerlich']
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.9 Verhältnis zum historischen Wissen im Zeitverlauf.pdf")
fig.show()
In [40]:
# Inspect the individual texts from 1896-1898 together with their
# 'wissen_identisch' annotation.
meta_anth_bin.query("1896<=year<=1898")[['author', 'title', 'wissen_identisch']].sort_values(by='wissen_identisch')
Out[40]:
author title wissen_identisch
1078 Gries, Wilhelm Zur Feier des hundertjährigen Geburtstages Kai... 0
1162 Scholz, Wilhelm von Der Strauchritter 0
1184 Wickenburg, Albrecht von Die Gelbschnäbel von Kolin 0
1120 Avenarius, Ferdinand Rolands Horn 0
1484 Schafheitlin, Adolf Des Oheims Erzählung 0
1460 Münchhausen, Börries von Der Abschied zu Fontainebleau 0
1448 Fischer, Julius Des Königs Traum 0
1414 Avenarius, Ferdinand Die Pest 0
1392 Ruseler, Georg Wittekind vor der Taufe 0
1384 Avenarius, Ferdinand Tejas Heimfahrt 0
1375 Renner, Gustav Cäsar 0
1374 Treller, Franz Die Hörige 0
1371 Avenarius, Ferdinand Der Breitenstein 0
1369 Renner, Gustav Siegrund und Helge 0
1290 Seydel, Max von Sphakteria 0
1187 Wickenburg, Albrecht von Das letzte Aufgebot 0
1497 Wickenburg, Albrecht von Des Sandwirts Heimkehr 0
1633 Rüthning, Paul Der Überfall 0
1755 Schönaich-Carolath, Emil von Lied des Gefangenen 0
1666 Münchhausen, Börries von Halfdan, Ragnars Sohn 0
1658 Heyse, Paul Die Mutter des Siegers 0
1588 Curti, Theod. Im Tale Schwyz 0
1606 Frey, Adolf Die Kappelkämpfer 0
1158 Münchhausen, Börries von Wir. Zu Helm und Schild geboren 1
1073 Delpy, Gustav Dem Andenken Kaiser Wilhelms I. 1
1074 Greif, Martin Dem Heldenkaiser 1
1109 Greif, Martin Königin Luise 1
1111 Greif, Martin Zur Bestattung Bismarcks im Sachsenwalde 1
1146 Wolff, Julius Das deutsche Heer 1
1076 Jordan, Wilhelm Was er vollbracht, ist wunderhaft 1
1077 Nießen, Joseph Zum 22. März 1897 [Festesklänge hallen wieder] 1
1080 Hoffs, Friedrich van Zur Gedächtnisfeier des Kaisers Wilhelms I. 1
1544 Saar, Ferdinand von Mozart 1
1485 Haaß, Robert Dem Vater des Vaterlandes 1
1480 Wildenbruch, Ernst von Inschrift an Villa Zirio in San Remo 1
1081 Greif, Martin Die Kornblume (Zu Kaiser Wilhelms Gedächtnis) 1
1075 Liliencron, A. von Es tönen die Glocken weit hin durch das Reich 1
1072 Diemar, Adamine von Zum Todestage der Kaiserin Augusta 1
1801 Fontane, Theodor Auf der Kuppe der Müggelberge 1
In [41]:
# Main feature for the following correlation/relation analyses.
main_feature = 'wissen_identisch'
In [42]:
# Top 20 positive correlations of the main feature with all numeric columns.
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = False).head(20)
Out[42]:
wissen_identisch                        1.000000
sprechakt_behaupten_vorhanden           0.439494
gegenwartsbezug                         0.403737
sprechinstanz_nicht_in_vergangenheit    0.379027
nogenre                                 0.349070
zustand                                 0.346266
gegenwartsdominant                      0.316185
nationalismus                           0.282898
ueberlieferung                          0.238014
ende                                    0.236246
neuzeit                                 0.226876
sprechinstanz_markiert                  0.225085
zeit_mitte                              0.223246
liebe_positiv                           0.220085
sprechakte_count                        0.216678
denkmal                                 0.211014
beginn                                  0.205790
sprechakt_beschreiben_vorhanden         0.195068
sonett                                  0.187559
wissen                                  0.180619
Name: wissen_identisch, dtype: float64
In [43]:
# Top 20 negative correlations of the main feature with all numeric columns.
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = True).head(20)
Out[43]:
wissen_ergaenzend                -0.702107
konkretheit                      -0.609416
in_hohem_mass_konkret            -0.596218
sprechakt_erzaehlen_vorhanden    -0.445050
ballade                          -0.382442
ereignis                         -0.367976
entity_count                     -0.229154
words                            -0.204238
kleinraum_count                  -0.203893
mittelalter                      -0.176233
objektmarker_vorhanden           -0.166505
unbekanntes_individuum_count     -0.159578
sprechinstanz_in_vergangenheit   -0.157757
bekanntes_individuum_count       -0.155486
entity_neutral                   -0.149800
rollengedicht                    -0.142517
persmarker_vorhanden             -0.136207
empirisch                        -0.127596
entity_negativ                   -0.124995
antike                           -0.117027
Name: wissen_identisch, dtype: float64
In [44]:
threshold = 0.3

# corr() over the full binarized frame is relatively expensive — compute it
# once and reuse the main feature's correlation series for both feature sets.
main_corrs = meta_anth_bin.corr(numeric_only=True)[main_feature]
bin_comp_features = get_features(main_corrs, threshold = threshold, mode = 'bin')
cont_comp_features = get_features(main_corrs, threshold = threshold, mode = 'cont')
In [45]:
# Binary–binary relations between the main feature and the selected features.
results = relations_binbin(meta=meta_anth_bin,
                           main_feature=main_feature,
                           comp_features=bin_comp_features)
In [46]:
# Features trivially entangled with the main feature are excluded.
directly_related = [
    'wissen_ergaenzend',
    'gegenwartsdominant',  # related to gegenwartsbezug
]

exclusion = "index not in @directly_related"
significance = "chi2_p < 0.05 and min_expected >= 5 and phi >= @threshold"
results_filtered = (
    results
    .query(exclusion)
    .query(significance)
    .sort_values(by='diff', ascending=False)
)
round(results_filtered, 2)
Out[46]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
gegenwartsbezug 0.22 341/1586 0.74 195/264 0.47 0.47 0.52 0.58 0.58 301.56 0.0 0.0 0.40 69.0 76.49
sprechakt_behaupten_vorhanden 0.12 185/1586 0.61 160/264 0.43 0.43 0.49 0.55 0.55 357.34 0.0 0.0 0.44 104.0 49.23
zustand 0.35 548/1586 0.83 220/264 0.44 0.44 0.49 0.54 0.54 221.82 0.0 0.0 0.35 44.0 109.60
sprechinstanz_nicht_in_vergangenheit 0.21 330/1586 0.69 183/264 0.43 0.43 0.49 0.54 0.55 265.77 0.0 0.0 0.38 81.0 73.21
nogenre 0.17 267/1586 0.59 155/264 0.35 0.36 0.42 0.48 0.48 225.42 0.0 0.0 0.35 109.0 60.22
ereignis 0.84 1328/1586 0.40 105/264 -0.50 -0.50 -0.44 -0.38 -0.38 250.50 0.0 0.0 0.37 105.0 59.51
sprechakt_erzaehlen_vorhanden 0.84 1339/1586 0.31 81/264 -0.59 -0.60 -0.54 -0.48 -0.48 366.43 0.0 0.0 0.45 81.0 61.36
ballade 0.64 1011/1586 0.09 25/264 -0.59 -0.59 -0.54 -0.50 -0.50 270.58 0.0 0.0 0.38 25.0 116.16
in_hohem_mass_konkret 0.91 1446/1586 0.25 67/264 -0.71 -0.71 -0.66 -0.60 -0.60 657.63 0.0 0.0 0.60 67.0 48.09
In [47]:
# All remaining relations that did not pass the significance/effect filter.
results_other = results.loc[~results.index.isin(results_filtered.index)]
round(results_other.sort_values(by='diff', ascending=False), 2)
Out[47]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
gegenwartsdominant 0.10 157/1586 0.42 110/264 0.26 0.26 0.32 0.38 0.38 184.95 0.00 0.00 0.32 110.0 38.10
stoffgebiet_positiv 0.43 941/2189 0.69 243/351 0.21 0.21 0.26 0.31 0.31 83.72 0.00 0.00 0.18 108.0 163.62
bekanntes_individuum_positiv 0.54 992/1823 0.79 160/202 0.19 0.19 0.25 0.31 0.32 45.57 0.00 0.00 0.15 42.0 87.08
entity_positiv 0.46 1587/3485 0.66 284/431 0.16 0.16 0.20 0.25 0.25 63.70 0.00 0.00 0.13 147.0 205.92
kollektiv_positiv 0.34 315/914 0.54 81/150 0.11 0.11 0.20 0.28 0.28 21.05 0.00 0.00 0.14 69.0 55.83
entity_ambivalent 0.06 226/3485 0.04 18/431 -0.04 -0.04 -0.02 -0.00 -0.00 3.50 0.06 0.07 0.03 18.0 26.85
stoffgebiet_ambivalent 0.14 310/2189 0.08 28/351 -0.09 -0.09 -0.06 -0.03 -0.03 10.03 0.00 0.00 0.06 28.0 46.71
entity_negativ 0.17 598/3485 0.11 46/431 -0.10 -0.10 -0.06 -0.03 -0.03 11.74 0.00 0.00 0.05 46.0 70.88
bekanntes_individuum_negativ 0.14 262/1823 0.07 14/202 -0.11 -0.11 -0.07 -0.04 -0.04 8.55 0.00 0.00 0.06 14.0 27.53
kollektiv_negativ 0.26 241/914 0.18 27/150 -0.15 -0.15 -0.08 -0.02 -0.01 4.79 0.03 0.03 0.07 27.0 37.78
stoffgebiet_negativ 0.22 490/2189 0.13 46/351 -0.13 -0.13 -0.09 -0.05 -0.05 15.64 0.00 0.00 0.08 46.0 74.07
stoffgebiet_neutral 0.20 448/2189 0.10 34/351 -0.14 -0.14 -0.11 -0.07 -0.07 22.86 0.00 0.00 0.09 34.0 66.61
entity_neutral 0.31 1074/3485 0.19 83/431 -0.15 -0.16 -0.12 -0.08 -0.08 24.62 0.00 0.00 0.08 83.0 127.34
unbekanntes_individuum_negativ 0.13 81/614 0.00 0/17 -0.16 -0.16 -0.13 -0.11 -0.11 2.57 0.11 0.15 0.06 0.0 2.18
unbekanntes_individuum_positiv 0.35 215/614 0.12 2/17 -0.37 -0.39 -0.23 -0.07 -0.06 3.96 0.05 0.07 0.08 2.0 5.85
wissen_ergaenzend 0.87 1383/1586 0.00 0/264 -0.89 -0.89 -0.87 -0.86 -0.86 911.96 0.00 0.00 0.70 0.0 66.64
In [48]:
# Re-run the binary–binary relations per sub-period and compare effect sizes.
result_categories = ['wenn_nicht', 'wenn_nicht_detail', 'wenn_ja', 'wenn_ja_detail', 'diff', 'chi2_p', 'phi',]

results_a = relations_binbin(
    meta=meta_anth_bin.query("1850 <= year <= 1884"),
    main_feature=main_feature,
    comp_features=results_filtered.index,
)

results_b = relations_binbin(
    meta=meta_anth_bin.query("1885 <= year <= 1918"),
    main_feature=main_feature,
    comp_features=results_filtered.index,
)

results_merged = results_a[result_categories].join(
    results_b[result_categories], lsuffix='_1850', rsuffix='_1885'
)
# Change between the periods, both as share difference and as phi.
results_merged['diff_of_diffs'] = results_merged['diff_1885'] - results_merged['diff_1850']
results_merged['diff_of_phis'] = results_merged['phi_1885'] - results_merged['phi_1850']

round(results_merged.sort_values(by='diff_of_phis'), 3)
Out[48]:
wenn_nicht_1850 wenn_nicht_detail_1850 wenn_ja_1850 wenn_ja_detail_1850 diff_1850 chi2_p_1850 phi_1850 wenn_nicht_1885 wenn_nicht_detail_1885 wenn_ja_1885 wenn_ja_detail_1885 diff_1885 chi2_p_1885 phi_1885 diff_of_diffs diff_of_phis
sprechakt_erzaehlen_vorhanden 0.859 1005/1170 0.312 55/176 -0.546 0.0 0.450 0.803 334/416 0.295 26/88 -0.507 0.0 0.426 0.039 -0.024
ereignis 0.860 1006/1170 0.420 74/176 -0.439 0.0 0.372 0.774 322/416 0.352 31/88 -0.422 0.0 0.350 0.018 -0.022
ballade 0.675 790/1170 0.125 22/176 -0.550 0.0 0.379 0.531 221/416 0.034 3/88 -0.497 0.0 0.380 0.053 0.001
in_hohem_mass_konkret 0.912 1067/1170 0.239 42/176 -0.673 0.0 0.596 0.911 379/416 0.284 25/88 -0.627 0.0 0.597 0.046 0.001
zustand 0.319 373/1170 0.807 142/176 0.488 0.0 0.339 0.421 175/416 0.886 78/88 0.466 0.0 0.354 -0.022 0.015
nogenre 0.142 166/1170 0.540 95/176 0.398 0.0 0.339 0.243 101/416 0.682 60/88 0.439 0.0 0.357 0.041 0.018
sprechakt_behaupten_vorhanden 0.111 130/1170 0.591 104/176 0.480 0.0 0.427 0.132 55/416 0.636 56/88 0.504 0.0 0.462 0.024 0.035
gegenwartsbezug 0.225 263/1170 0.727 128/176 0.502 0.0 0.373 0.188 78/416 0.761 67/88 0.574 0.0 0.481 0.071 0.108
sprechinstanz_nicht_in_vergangenheit 0.226 264/1170 0.693 122/176 0.468 0.0 0.349 0.159 66/416 0.693 61/88 0.535 0.0 0.467 0.067 0.119
In [49]:
# Binary main feature vs. the continuous comparison features.
results = relations_bincont(meta=meta_anth_bin,
                            main_feature=main_feature,
                            comp_features=cont_comp_features)
In [50]:
# Sort by point-biserial correlation (strongest positive first), rounded.
round(results.sort_values(by = 'pointbiserialr_corr', ascending = False), 2)
Out[50]:
wenn_nicht a_merkmal=0 a_merkmal=1 a_merkmal=2 a_merkmal=3 a_merkmal>=4 wenn_ja b_merkmal=0 b_merkmal=1 b_merkmal=2 ... pointbiserialr_corr pointbiserialr_p ttest_p cohens_d mannwhitneyu_stat mannwhitneyu_p meandiffs_ci_lower meandiffs_ci_bootstrap_lower meandiffs_ci_upper meandiffs_ci_bootstrap_upper
konkretheit 0.96 0.0 [1/1586] 0.91 [1446/1586] 0.0 [0/1586] 0.0 [0/1586] 0.0 [0/1586] 0.58 0.09 [24/264] 0.25 [67/264] 0.0 [0/264] ... -0.61 0.0 0.0 1.67 348674.5 0.0 -0.4 -0.41 -0.35 -0.34

1 rows × 22 columns

In [51]:
# How strongly does 'wissen_ergaenzend' coincide with high concreteness?
results = relations_binbin(meta=meta_anth_bin,
                           main_feature='wissen_ergaenzend',
                           comp_features=['in_hohem_mass_konkret'])

# Single comparison feature — the transposed view is easier to read.
results.T
Out[51]:
in_hohem_mass_konkret
wenn_nicht 0.513919
wenn_nicht_detail 240/467
wenn_ja 0.920463
wenn_ja_detail 1273/1383
diff_low_bootstrap 0.364354
diff_low 0.359023
diff 0.406544
diff_high 0.454065
diff_high_bootstrap 0.452306
chi2 387.308792
chi2_p 0.0
fisher_p 0.0
phi 0.457554
min_real 110.0
min_expected 85.06973

Geschichtsmarker¶

In [52]:
# Share of texts containing each history-marker type over time (smoothed).
meta_plot = ts[[
    'marker_person_share_smoothed', 
    'marker_zeit_share_smoothed', 
    'marker_ort_share_smoothed',
    'marker_objekt_share_smoothed',
]]
# Display names for the plot legend.
meta_plot.columns = [
    'Person', 
    'Zeit', 
    'Ort',
    'Objekt',
]
# save_ts_data(meta_plot, prefix='06_09_Geschichtsmarker_einzeln_')

fig = create_ts_plot(
    data = meta_plot, 
    columns = meta_plot.columns, 
    y_axis_title = 'Anteil an Texten',
    add_corporas = sub_df, add_corpora_names = sub_names,
    add_corpora_categories = ['marker_person', 'marker_zeit', 'marker_ort', 'marker_objekt',]
)
fig = update_fig_for_publication(fig)
# Plain string literal: the original f-string had no placeholders.
fig.write_image("plots/6.9 Geschichtsmarker-Typen im Zeitverlauf (einzeln).pdf")
fig.show()
In [53]:
# Mean number of marker types per text over time, overall and split by
# whether the marker occurs in the text body or in the title.
meta_plot = ts[[
    'marker_all_per_text_smoothed', 
    'marker_text_per_text_smoothed', 
    'marker_title_per_text_smoothed',
]]
# Display names for the plot legend.
meta_plot.columns = [
    'Alle Marker', 
    'Marker im Text', 
    'Marker im Titel',
]
save_ts_data(meta_plot, prefix='06_09_Geschichtsmarker_gesamt_')

fig = create_ts_plot(
    data = meta_plot, 
    columns = meta_plot.columns, 
    y_axis_title = 'Markertypen pro Text',
    add_corporas = sub_df, add_corpora_names = sub_names,
    add_corpora_categories = ['marker_all_per_text', 'marker_text_per_text', 'marker_title_per_text']
)
fig = update_fig_for_publication(fig)
# Plain string literal: the original f-string had no placeholders.
fig.write_image("plots/6.9 Geschichtsmarker-Typen im Zeitverlauf (gesamt).pdf")
fig.show()
In [54]:
# Distribution of the number of distinct marker types (0–4) per text
# over time; column names follow a fixed '{n}marker…' pattern.
meta_plot = ts[[f'{n}marker_share_smoothed' for n in range(5)]]
meta_plot.columns = [f'{n} Markertypen' for n in range(5)]

fig = create_ts_plot(
    data=meta_plot,
    columns=meta_plot.columns,
    y_axis_title='Anteil an Texten',
    add_corporas=sub_df, add_corpora_names=sub_names,
    add_corpora_categories=[f'{n}marker' for n in range(5)],
)
fig.show()
In [55]:
# Switch the main feature to the number of distinct marker types per text.
main_feature = 'marker_count'
In [56]:
# Top 20 positive correlations of the main feature with all numeric columns.
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = False).head(20)
Out[56]:
marker_count                     1.000000
objektmarker_vorhanden           0.551354
ortmarker_vorhanden              0.537889
zeitmarker_vorhanden             0.510047
persmarker_vorhanden             0.442834
words                            0.300117
zeitebenen                       0.294403
nation_volk_d_positiv            0.230049
bekanntes_individuum_count       0.218128
kleinraum_count                  0.182637
heroismus                        0.168803
antike                           0.167671
fixierbarkeit                    0.145121
sprechakt_erzaehlen_vorhanden    0.137233
ereignis                         0.135113
entity_count                     0.134463
anachronismus                    0.131773
mittelraum_count                 0.124431
ueberlieferung                   0.120582
geschichtsauffassung_positiv     0.113113
Name: marker_count, dtype: float64
In [57]:
# Top 20 negative correlations of the main feature with all numeric columns.
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = True).head(20)
Out[57]:
neuzeit                          -0.175863
liebe_negativ                    -0.175587
ende                             -0.174842
zeit_mitte                       -0.170904
beginn                           -0.163313
religion_positiv                 -0.157501
unbekanntes_individuum_count     -0.131766
ueberlieferung_negativ           -0.125123
tod_positiv                      -0.106239
year_predict_ages_mean           -0.092459
ueberlieferung_positiv           -0.090329
wissen_identisch                 -0.088696
lied                             -0.081683
year                             -0.080969
politik_negativ                  -0.073416
decade                           -0.071780
nogenre                          -0.071700
behandelt_deutschen_mittelraum   -0.070754
sprechakt_behaupten_vorhanden    -0.070608
tod_negativ                      -0.069347
Name: marker_count, dtype: float64
In [58]:
threshold = 0.15

# corr() over the full binarized frame is relatively expensive — compute it
# once and reuse the main feature's correlation series for both feature sets.
main_corrs = meta_anth_bin.corr(numeric_only=True)[main_feature]
bin_comp_features = get_features(main_corrs, threshold = threshold, mode = 'bin')
cont_comp_features = get_features(main_corrs, threshold = threshold, mode = 'cont')
In [59]:
# Continuous main feature vs. the binary comparison features.
results = relations_contbin(meta=meta_anth_bin,
                            main_feature=main_feature,
                            comp_features=bin_comp_features)
In [60]:
# The four marker-presence flags are components of marker_count itself.
directly_related = [
    'objektmarker_vorhanden', 'ortmarker_vorhanden', 'zeitmarker_vorhanden', 'persmarker_vorhanden',
]

significance = "mannwhitneyu_p < 0.05"
effect_size = "(pointbiserialr_corr >= @threshold or pointbiserialr_corr <= -@threshold)"
results_filtered = (
    results
    .query("index not in @directly_related")
    .query(f"{significance} and {effect_size}")
    .sort_values(by='pointbiserialr_corr', ascending=False)
)
round(results_filtered, 2)
Out[60]:
wenn marker_count = 0: Anteil Texte mit Feature = ... wenn marker_count = 1: Anteil Texte mit Feature = ... wenn marker_count = 2: Anteil Texte mit Feature = ... wenn marker_count = 3: Anteil Texte mit Feature = ... wenn marker_count > 3: Anteil Texte mit Feature = ... pointbiserialr_corr pointbiserialr_p ttest_p cohens_d mannwhitneyu_stat mannwhitneyu_p
heroismus 0.14285714285714285 [7/49] 0.1867612293144208 [79/423] 0.266754270696452 [203/761] 0.37037037037037035 [180/486] 0.4122137404580153 [54/131] 0.17 0.00 0.00 -0.38 274934.0 0.00
antike 0.02040816326530612 [1/49] 0.07328605200945626 [31/423] 0.11038107752956636 [84/761] 0.18930041152263374 [92/486] 0.2595419847328244 [34/131] 0.17 0.00 0.00 -0.51 142243.0 0.00
religion_positiv 0.5 [1/2] 0.6515151515151515 [43/66] 0.46956521739130436 [54/115] 0.417910447761194 [28/67] 0.4117647058823529 [7/17] -0.16 0.01 0.01 0.32 10512.0 0.01
neuzeit 0.7755102040816326 [38/49] 0.6572104018912529 [278/423] 0.5374507227332457 [409/761] 0.46502057613168724 [226/486] 0.3816793893129771 [50/131] -0.18 0.00 0.00 0.36 506026.0 0.00
In [61]:
# All remaining relations that did not pass the significance/effect filter.
results_other = results.loc[~results.index.isin(results_filtered.index)]
round(results_other.sort_values(by='pointbiserialr_corr', ascending=False), 2)
Out[61]:
wenn marker_count = 0: Anteil Texte mit Feature = ... wenn marker_count = 1: Anteil Texte mit Feature = ... wenn marker_count = 2: Anteil Texte mit Feature = ... wenn marker_count = 3: Anteil Texte mit Feature = ... wenn marker_count > 3: Anteil Texte mit Feature = ... pointbiserialr_corr pointbiserialr_p ttest_p cohens_d mannwhitneyu_stat mannwhitneyu_p
objektmarker_vorhanden 0.0 [0/49] 0.18912529550827423 [80/423] 0.6254927726675427 [476/761] 0.8621399176954733 [419/486] 1.0 [131/131] 0.55 0.00 0.00 -1.36 155638.5 0.00
ortmarker_vorhanden 0.0 [0/49] 0.054373522458628844 [23/423] 0.15768725361366623 [120/761] 0.4897119341563786 [238/486] 1.0 [131/131] 0.54 0.00 0.00 -1.41 122779.0 0.00
zeitmarker_vorhanden 0.0 [0/49] 0.13947990543735225 [59/423] 0.3455978975032852 [263/761] 0.6707818930041153 [326/486] 1.0 [131/131] 0.51 0.00 0.00 -1.19 183207.0 0.00
persmarker_vorhanden 0.0 [0/49] 0.6170212765957447 [261/423] 0.871222076215506 [663/761] 0.977366255144033 [475/486] 1.0 [131/131] 0.44 0.00 0.00 -1.36 91982.5 0.00
nation_volk_d_positiv 0.0 [0/2] 0.7857142857142857 [11/14] 0.6470588235294118 [11/17] 0.7647058823529411 [13/17] 1.0 [6/6] 0.23 0.09 0.09 -0.54 229.5 0.14
bekanntes_individuum_negativ 0.0 [0/13] 0.10984848484848485 [29/264] 0.2048611111111111 [118/576] 0.2074074074074074 [84/405] 0.21100917431192662 [23/109] 0.08 0.00 0.00 -0.22 125235.5 0.00
entity_positiv 0.5306122448979592 [26/49] 0.46335697399527187 [196/423] 0.4533508541392904 [345/761] 0.42386831275720166 [206/486] 0.40458015267175573 [53/131] 0.07 0.00 0.89 0.01 225716.0 0.93
entity_negativ 0.14285714285714285 [7/49] 0.2293144208037825 [97/423] 0.28252299605781866 [215/761] 0.26954732510288065 [131/486] 0.25190839694656486 [33/131] 0.07 0.00 0.06 -0.10 294935.5 0.07
entity_ambivalent 0.04081632653061224 [2/49] 0.08983451536643026 [38/423] 0.11432325886990802 [87/761] 0.09670781893004116 [47/486] 0.13740458015267176 [18/131] 0.06 0.01 0.13 -0.12 148196.0 0.19
stoffgebiet_neutral 0.08163265306122448 [4/49] 0.17494089834515367 [74/423] 0.18528252299605782 [141/761] 0.16666666666666666 [81/486] 0.183206106870229 [24/131] 0.04 0.06 0.51 -0.04 230964.5 0.63
stoffgebiet_ambivalent 0.14285714285714285 [7/49] 0.15839243498817968 [67/423] 0.15768725361366623 [120/761] 0.15432098765432098 [75/486] 0.22137404580152673 [29/131] 0.04 0.11 0.27 -0.07 221141.5 0.37
kollektiv_positiv 0.375 [9/24] 0.4419889502762431 [80/181] 0.41590214067278286 [136/327] 0.44933920704845814 [102/227] 0.4857142857142857 [34/70] 0.03 0.38 0.38 -0.06 81887.5 0.43
stoffgebiet_positiv 0.4489795918367347 [22/49] 0.425531914893617 [180/423] 0.46254927726675427 [352/761] 0.4382716049382716 [213/486] 0.366412213740458 [48/131] 0.02 0.35 0.76 0.02 349197.5 0.86
entity_neutral 0.42857142857142855 [21/49] 0.29550827423167847 [125/423] 0.266754270696452 [203/761] 0.27983539094650206 [136/486] 0.25190839694656486 [33/131] -0.01 0.81 0.11 0.08 282883.0 0.18
kollektiv_negativ 0.25 [6/24] 0.32044198895027626 [58/181] 0.3302752293577982 [108/327] 0.29955947136563876 [68/227] 0.2857142857142857 [20/70] -0.01 0.68 0.68 0.03 75455.0 0.63
unbekanntes_individuum_negativ 0.045454545454545456 [1/22] 0.232 [29/125] 0.15083798882681565 [27/179] 0.16483516483516483 [15/91] 0.17647058823529413 [3/17] -0.02 0.74 0.74 0.04 14041.5 0.54
stoffgebiet_negativ 0.32653061224489793 [16/49] 0.26004728132387706 [110/423] 0.22601839684625494 [172/761] 0.19958847736625515 [97/486] 0.183206106870229 [24/131] -0.02 0.45 0.01 0.15 310503.0 0.01
bekanntes_individuum_positiv 0.6153846153846154 [8/13] 0.696969696969697 [184/264] 0.7065972222222222 [407/576] 0.6765432098765433 [274/405] 0.6422018348623854 [70/109] -0.03 0.30 0.30 0.06 206687.5 0.29
unbekanntes_individuum_positiv 0.5454545454545454 [12/22] 0.424 [53/125] 0.39106145251396646 [70/179] 0.46153846153846156 [42/91] 0.29411764705882354 [5/17] -0.03 0.49 0.49 0.07 23530.0 0.62
liebe_negativ 0.0 [0/1] 0.3 [3/10] 0.08108108108108109 [3/37] 0.09090909090909091 [1/11] 0.0 [0/4] -0.18 0.17 0.17 0.60 256.5 0.14
In [62]:
# Sub-period comparison of the continuous–binary relations.
result_categories = ['pointbiserialr_corr', 'mannwhitneyu_p']

results_a = relations_contbin(
    meta=meta_anth_bin.query("1850 <= year <= 1884"),
    main_feature=main_feature,
    comp_features=results_filtered.index,
)

results_b = relations_contbin(
    meta=meta_anth_bin.query("1885 <= year <= 1918"),
    main_feature=main_feature,
    comp_features=results_filtered.index,
)

results_merged = results_a[result_categories].join(
    results_b[result_categories], lsuffix='_1850', rsuffix='_1885'
)
results_merged['diff_of_corrs'] = (
    results_merged['pointbiserialr_corr_1885']
    - results_merged['pointbiserialr_corr_1850']
)

round(results_merged.sort_values(by='diff_of_corrs'), 3)
Out[62]:
pointbiserialr_corr_1850 mannwhitneyu_p_1850 pointbiserialr_corr_1885 mannwhitneyu_p_1885 diff_of_corrs
religion_positiv -0.133 0.032 -0.256 0.076 -0.123
antike 0.195 0.000 0.085 0.090 -0.111
heroismus 0.182 0.000 0.104 0.024 -0.078
neuzeit -0.162 0.000 -0.179 0.000 -0.017
In [63]:
# Relations between marker_count and the rating features (shows a progress bar).
results = relations_contbin_ratings(meta_anth_bin, main_feature)
results.sort_values(by = 'pointbiserialr_corr')
  0%|          | 0/14 [00:00<?, ?it/s]
Out[63]:
wenn marker_count = 1: Anteil mit Feature = ... wenn marker_count = 2: Anteil mit Feature = ... wenn marker_count = 3: Anteil mit Feature = ... wenn marker_count = 4: Anteil mit Feature = ... pointbiserialr_corr pointbiserialr_p mannwhitneyu_stat mannwhitneyu_p
entity_neutral 0.3234 [271/838] 0.2741 [440/1605] 0.2943 [319/1084] 0.2805 [85/303] -0.040430 0.011399 1528628.0 0.027589
bekanntes_individuum_positiv 0.5917 [213/360] 0.5807 [500/861] 0.548 [337/615] 0.5345 [93/174] -0.038488 0.083357 481194.0 0.077973
stoffgebiet_negativ 0.2234 [126/564] 0.2058 [214/1040] 0.2171 [152/700] 0.1547 [28/181] -0.033441 0.091989 516954.0 0.159493
unbekanntes_individuum_positiv 0.3403 [65/191] 0.3425 [87/254] 0.3594 [46/128] 0.2083 [5/24] -0.030496 0.444446 43808.0 0.590164
kollektiv_negativ 0.262 [60/229] 0.2642 [112/424] 0.2386 [68/285] 0.2366 [22/93] -0.009726 0.751324 104848.0 0.661277
stoffgebiet_positiv 0.4805 [271/564] 0.4625 [481/1040] 0.4671 [327/700] 0.4475 [81/181] -0.009321 0.638676 794270.5 0.627611
unbekanntes_individuum_negativ 0.1571 [30/191] 0.126 [32/254] 0.1172 [15/128] 0.125 [3/24] -0.001769 0.964622 21887.5 0.789823
entity_positiv 0.4773 [400/838] 0.4822 [774/1605] 0.4825 [523/1084] 0.4587 [139/303] 0.003371 0.832993 1918894.0 0.862723
kollektiv_positiv 0.3843 [88/229] 0.3679 [156/424] 0.3789 [108/285] 0.3763 [35/93] 0.013102 0.669469 133755.0 0.746670
stoffgebiet_ambivalent 0.1294 [73/564] 0.1308 [136/1040] 0.1271 [89/700] 0.1823 [33/181] 0.020629 0.298692 382210.0 0.397510
entity_negativ 0.148 [124/838] 0.1763 [283/1605] 0.1633 [177/1084] 0.1749 [53/303] 0.024670 0.122696 1087238.0 0.176100
stoffgebiet_neutral 0.1667 [94/564] 0.201 [209/1040] 0.1886 [132/700] 0.2155 [39/181] 0.028788 0.146926 514505.5 0.177607
entity_ambivalent 0.0513 [43/838] 0.0673 [108/1605] 0.06 [65/1084] 0.0858 [26/303] 0.031513 0.048621 475987.0 0.084288
bekanntes_individuum_negativ 0.0861 [31/360] 0.1498 [129/861] 0.1463 [90/615] 0.1494 [26/174] 0.053802 0.015464 261169.5 0.019958
In [64]:
# Continuous–continuous relations for the main feature.
results = relations_contcont(meta=meta_anth_bin,
                             main_feature=main_feature,
                             comp_features=cont_comp_features)
In [65]:
# Exclude features directly related to zeit_mitte.
directly_related = [
    'beginn', 'ende',  # related to zeit_mitte
]
results_filtered = (
    results
    .query("index not in @directly_related")
    .query("pearsonr_p < 0.05")
    .query("pearsonr_corr >= @threshold or pearsonr_corr <= -@threshold")
    .sort_values(by='pearsonr_corr', ascending=False)
)
round(results_filtered, 2)
Out[65]:
wenn marker_count = 0: Mittelwert Feature = ... wenn marker_count = 1: Mittelwert Feature = ... wenn marker_count = 2: Mittelwert Feature = ... wenn marker_count = 3: Mittelwert Feature = ... wenn marker_count > 3: Mittelwert Feature = ... pearsonr_corr pearsonr_p
words 208.34 223.73 307.93 361.20 470.13 0.30 0.0
zeitebenen 1.59 1.74 1.96 2.25 2.63 0.29 0.0
bekanntes_individuum_count 0.31 0.85 1.13 1.27 1.33 0.22 0.0
kleinraum_count 0.41 0.52 0.66 0.76 0.84 0.18 0.0
zeit_mitte 1667.03 1436.41 1299.61 1200.74 1072.13 -0.17 0.0
In [66]:
# All remaining relations that did not pass the significance/effect filter.
results_other = results.loc[~results.index.isin(results_filtered.index)]
round(results_other.sort_values(by='pearsonr_corr', ascending=False), 2)
Out[66]:
wenn marker_count = 0: Mittelwert Feature = ... wenn marker_count = 1: Mittelwert Feature = ... wenn marker_count = 2: Mittelwert Feature = ... wenn marker_count = 3: Mittelwert Feature = ... wenn marker_count > 3: Mittelwert Feature = ... pearsonr_corr pearsonr_p
beginn 1660.52 1423.46 1282.06 1194.55 1059.44 -0.16 0.0
ende 1673.54 1449.36 1317.15 1206.93 1084.81 -0.17 0.0
In [67]:
# Box plots: distribution of each correlated continuous feature per
# number of distinct marker types.
meta_plot = meta_anth_bin.copy()
meta_plot = meta_plot.sort_values(by='zeitebenen')
# Clip outliers so the y-axes stay readable.
meta_plot['words'] = meta_plot['words'].clip(upper=1250)
meta_plot['zeit_mitte'] = meta_plot['zeit_mitte'].clip(lower=0)

# The label mapping is loop-invariant — build it once instead of per figure.
axis_labels = {
    'zeitebenen': 'Anzahl Zeitebenen',
    'marker_count': 'Anzahl Geschichtsmarker-Typen',
    'zeit_mitte': 'Mitte der dominanten Zeitebene',
    'words': 'Anzahl Wörter',
    'kleinraum_count': 'Anzahl behandelte Kleinräume',
    'bekanntes_individuum_count': 'Anzahl behandelte<br>bekannte Individuen',
}

for cont_comp_feature in results_filtered.index:
    fig = px.box(
        meta_plot,
        x=main_feature,
        y=cont_comp_feature,
        labels=axis_labels,
        color_discrete_sequence=['grey'],
    )
    # NOTE(review): the mean marker is skipped for the two clipped columns
    # ('words', 'zeit_mitte') — presumably because clipping would bias the
    # displayed mean; confirm intent.
    if cont_comp_feature not in ('words', 'zeit_mitte'):
        fig.update_traces(boxmean=True)
    fig.update_layout(
        width=700, height=300,
        xaxis=dict(tickfont=dict(size=16), titlefont=dict(size=16)),
        yaxis=dict(tickfont=dict(size=16), titlefont=dict(size=16)),
        legend=dict(font=dict(size=16), x=0.61, y=0.88),
        bargap=0.1,
    )
    fig = update_fig_for_publication(fig, make_grey=True)
    fig.write_image(f"plots/6.9 Geschichtsmarker – {cont_comp_feature}.pdf")
    fig.show()
In [68]:
# Sub-period comparison of the continuous–continuous relations; the
# difference is taken between absolute correlations (change in strength).
result_categories = ['pearsonr_corr', 'pearsonr_p']

results_a = relations_contcont(
    meta=meta_anth_bin.query("1850 <= year <= 1884"),
    main_feature=main_feature,
    comp_features=results_filtered.index,
)

results_b = relations_contcont(
    meta=meta_anth_bin.query("1885 <= year <= 1918"),
    main_feature=main_feature,
    comp_features=results_filtered.index,
)

results_merged = results_a[result_categories].join(
    results_b[result_categories], lsuffix='_1850', rsuffix='_1885'
)
results_merged['diff_of_corrs'] = (
    results_merged['pearsonr_corr_1885'].abs()
    - results_merged['pearsonr_corr_1850'].abs()
)

round(results_merged.sort_values(by='diff_of_corrs'), 3)
Out[68]:
pearsonr_corr_1850 pearsonr_p_1850 pearsonr_corr_1885 pearsonr_p_1885 diff_of_corrs
zeit_mitte -0.197 0.0 -0.093 0.037 -0.104
words 0.299 0.0 0.268 0.000 -0.031
kleinraum_count 0.179 0.0 0.160 0.000 -0.019
zeitebenen 0.299 0.0 0.285 0.000 -0.014
bekanntes_individuum_count 0.190 0.0 0.254 0.000 0.064

Textlänge¶

In [69]:
# Period flag: 1 = late period (1885–1918), 0 = early period (1850–1884).
# Vectorized comparison instead of a Python list comprehension over the Series.
meta_anth_bin['period'] = (meta_anth_bin['year'] >= 1885).astype(int)

# (0, 9999999) is the unrestricted baseline; the other intervals bin by length.
for interval_start, interval_stop in [(0, 9999999), (0, 200), (201, 400), (401, 9999999)]:
    meta_size = meta_anth_bin.query("@interval_start <= words <= @interval_stop")
    results = relations_bincont(
        meta = meta_size, 
        main_feature = 'period',
        comp_features = ['marker_count']
    )
    
    # wenn_nicht = 1850–1884, wenn_ja = 1885–1918
    print(f"Texte mit {interval_start}–{interval_stop} Wörtern")
    print(f"1850–1884 : {meta_size.query('period == 0').shape[0]}")
    print(f"1885–1918 : {meta_size.query('period == 1').shape[0]}")
    print(results[['wenn_nicht', 'wenn_ja', 'mannwhitneyu_stat', 'mannwhitneyu_p', 'pointbiserialr_corr']].T)
    print("\n")
Texte mit 0–9999999 Wörtern
1850–1884 : 1221
1885–1918 : 502
                      marker_count
wenn_nicht                2.211302
wenn_ja                   1.972112
mannwhitneyu_stat    348270.500000
mannwhitneyu_p            0.000003
pointbiserialr_corr      -0.117290


Texte mit 0–200 Wörtern
1850–1884 : 368
1885–1918 : 212
                     marker_count
wenn_nicht               1.864130
wenn_ja                  1.698113
mannwhitneyu_stat    42849.000000
mannwhitneyu_p           0.037070
pointbiserialr_corr     -0.087557


Texte mit 201–400 Wörtern
1850–1884 : 523
1885–1918 : 198
                     marker_count
wenn_nicht               2.216061
wenn_ja                  2.121212
mannwhitneyu_stat    54524.500000
mannwhitneyu_p           0.238186
pointbiserialr_corr     -0.049897


Texte mit 401–9999999 Wörtern
1850–1884 : 330
1885–1918 : 92
                     marker_count
wenn_nicht               2.590909
wenn_ja                  2.282609
mannwhitneyu_stat    17778.000000
mannwhitneyu_p           0.007912
pointbiserialr_corr     -0.140326


Einzelne Markertypen¶

In [70]:
# For each marker type: mean zeit_mitte with vs. without that marker present.
marker_types = [
    'persmarker_vorhanden', 'zeitmarker_vorhanden',
    'ortmarker_vorhanden', 'objektmarker_vorhanden',
]
report_cols = ['wenn_nicht', 'wenn_ja', 'pointbiserialr_corr',
               'mannwhitneyu_stat', 'mannwhitneyu_p']

for marker_type in marker_types:
    results = relations_bincont(
        meta=meta_anth_bin,
        main_feature=marker_type,
        comp_features=['zeit_mitte'],
    )
    print(marker_type)
    print(round(results[report_cols].T, 2))
    print("\n")
persmarker_vorhanden
                     zeit_mitte
wenn_nicht              1489.35
wenn_ja                 1258.76
pointbiserialr_corr       -0.13
mannwhitneyu_stat     302450.00
mannwhitneyu_p             0.00


zeitmarker_vorhanden
                     zeit_mitte
wenn_nicht              1253.09
wenn_ja                 1359.84
pointbiserialr_corr        0.08
mannwhitneyu_stat     340009.50
mannwhitneyu_p             0.00


ortmarker_vorhanden
                     zeit_mitte
wenn_nicht              1341.97
wenn_ja                 1183.58
pointbiserialr_corr       -0.10
mannwhitneyu_stat     346646.00
mannwhitneyu_p             0.55


objektmarker_vorhanden
                     zeit_mitte
wenn_nicht              1472.40
wenn_ja                 1181.06
pointbiserialr_corr       -0.21
mannwhitneyu_stat     564747.50
mannwhitneyu_p             0.00


In [71]:
# Is 'heroismus' associated with the presence of the individual marker types?
marker_flags = [
    'persmarker_vorhanden', 'zeitmarker_vorhanden',
    'ortmarker_vorhanden', 'objektmarker_vorhanden',
]
results = relations_binbin(
    meta=meta_anth_bin,
    main_feature='heroismus',
    comp_features=marker_flags,
)
round(results, 2)
Out[71]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
persmarker_vorhanden 0.80 1068/1327 0.88 462/523 0.04 0.04 0.08 0.11 0.11 16.18 0.00 0.00 0.09 61.0 90.46
zeitmarker_vorhanden 0.41 542/1327 0.45 237/523 -0.01 -0.01 0.04 0.09 0.10 3.08 0.08 0.08 0.04 237.0 220.23
ortmarker_vorhanden 0.23 306/1327 0.39 206/523 0.12 0.12 0.16 0.21 0.21 49.97 0.00 0.00 0.16 206.0 144.74
objektmarker_vorhanden 0.58 770/1327 0.64 336/523 0.01 0.01 0.06 0.11 0.11 6.03 0.01 0.02 0.06 187.0 210.33

Korpora¶

In [72]:
# Mann–Whitney U test: marker_count in the early anthology corpus (1850–1884)
# vs. the binarized Münchhausen-circle corpus.
meta_anth_early = meta_anth_bin.query("1850<=year<=1884")
meta_muench_bin = binarize_meta(meta_muench)
mannwhitneyu(meta_anth_early['marker_count'], meta_muench_bin['marker_count'])
Out[72]:
MannwhitneyuResult(statistic=109242.5, pvalue=0.001039161435277055)